Loading libraries

library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(knitr)
library(corrplot)

Reading data

# Load at most the first 10000 data rows of the summary file and report the
# resulting dimensions.
# NOTE(review): read.csv2() defaults to dec = ","; the summary table printed
# later in this report lists values such as 0.8571428571 as category counts,
# so the file presumably uses "." as its decimal mark -- confirm and consider
# passing dec = "." here.
data <- read.csv2(file = "./all_summary.csv", nrows = 10000)
dim(data)
## [1] 10000   412

Processing missing data

# Named columns that are kept alongside the part_01 columns.
required_columns <- c(
  "res_name",
  "blob_volume_coverage",
  "blob_volume_coverage_second",
  "skeleton_density",
  "local_res_atom_non_h_count",
  "local_res_atom_non_h_electron_sum",
  "dict_atom_non_h_count",
  "dict_atom_non_h_electron_sum"
)
dim(data)
## [1] 10000   412
# Keep the named columns plus every column whose name contains "part_01",
# then drop each row that has a missing value in any remaining column.
data <- drop_na(
  select(data, one_of(required_columns), contains("part_01"))
)
dim(data)
## [1] 9580  114

Deleting chosen ligands

# Residue names whose rows are removed from `data`.
# NOTE(review): "H20" (digit zero) sits next to "HOH" and "WAT"; possibly
# "H2O" was intended -- confirm against the source of this list.
deletable_res_name <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB",
  "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
  "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP",
  "TYR", "VAL",
  "DA", "DG", "DT", "DC", "DU",
  "A", "G", "T", "C", "U",
  "HOH", "H20", "WAT"
)
# Retain only rows whose res_name is not in the exclusion list.
data <- filter(data, !(res_name %in% deletable_res_name))
dim(data)
## [1] 9522  114

Data summary

# Pick the ligand name and three descriptive columns, then render their
# per-column summary as a table.
statistics <- select(
  data,
  res_name,
  blob_volume_coverage,
  blob_volume_coverage_second,
  skeleton_density
)
statistics %>%
  summary() %>%
  kable()
res_name blob_volume_coverage blob_volume_coverage_second skeleton_density
SO4 :1005 1 : 130 0 :8024 0 :1088
GOL : 629 0.8571428571: 6 0.0243902439 : 2 1 : 944
EDO : 512 0.8333333333: 5 0.02523659306: 2 0.6666666667: 511
NAG : 452 0.8461538462: 5 0.0200661832 : 1 0.5 : 279
CL : 387 0.3266490765: 4 0.02009536785: 1 0.1666666667: 225
DMS : 340 0.75 : 4 0.02016883762: 1 0.1538461538: 220
(Other):6197 (Other) :9368 (Other) :1491 (Other) :6255
# Print the current dimensions (rows, columns) of `data`.
dim(data)
## [1] 9522  114

Cardinality of ligands by name

# Bar chart of ligand counts, with bars ordered by decreasing `n` and filled
# by `n`; converted to an interactive plotly widget afterwards.
# NOTE(review): `popular_ligands` is not defined in the visible part of this
# report; it must provide the columns `res_name` and `n` -- confirm where it
# is created.
plot_ligands <- ggplot(
  popular_ligands,
  aes(x = reorder(res_name, -n), y = n, fill = n)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Cardinality of ligands by name", x = "ligand")

ggplotly(plot_ligands)

Correlation between variables

# Correlation matrix of the numeric columns of `data`, drawn as a
# lower-triangle correlation plot with black labels rotated by 45 degrees.
corrplot(
  cor(select_if(data, is.numeric)),
  type = "lower",
  tl.col = "black",
  tl.srt = 45
)

Distribution of atom and electron count

# Density of local_res_atom_non_h_count, drawn as a filled area without an
# outline and shown as an interactive plotly widget.
plot_atom <- ggplot(data, aes(x = local_res_atom_non_h_count)) +
  geom_density(fill = "#00CECB", alpha = 0.3, color = NA) +
  labs(title = "Atom count distribution", x = "atom count")

ggplotly(plot_atom)
# Density of local_res_atom_non_h_electron_sum, drawn as a filled area without
# an outline and shown as an interactive plotly widget.
plot_electron <- ggplot(data, aes(x = local_res_atom_non_h_electron_sum)) +
  geom_density(fill = "#FF5E5B", alpha = 0.3, color = NA) +
  labs(title = "Electron count distribution", x = "electron count")

ggplotly(plot_electron)

Distribution of part_01 columns

# Reshape the part_01 columns into long form: one row per (column, value)
# pair, with the source column name in `part` and its value in `value`.
# After the select() every remaining column is a part_01 column, so all of
# them are gathered with everything() instead of a hard-coded 1:106 index
# range that would fail or truncate if the number of matching columns changed.
plot_part_data <- data %>%
  select(contains("part_01")) %>%
  gather(part, value, everything())
## Warning: attributes are not identical across measure variables;
## they will be dropped
dim(plot_part_data)
## [1] 700342      2
# 
# plot_part_data_1 <- plot_part_data[1:118926,]
# plot_part_data_2 <- plot_part_data[118927:237852,]
# plot_part_data_3 <- plot_part_data[237853:356778,]
# plot_part_data_4 <- plot_part_data[356779:475704,]
# plot_part_data_5 <- plot_part_data[475705:594630,]
# plot_part_data_6 <- plot_part_data[594631:700342,]
# 
# plot_ly(plot_part_data_1, x = plot_part_data_1$part, y = plot_part_data_1$value, type = 'box')
# plot_ly(plot_part_data_2, x = plot_part_data_2$part, y = plot_part_data_2$value, type = 'box')
# plot_ly(plot_part_data_3, x = plot_part_data_3$part, y = plot_part_data_3$value, type = 'box')
# plot_ly(plot_part_data_4, x = plot_part_data_4$part, y = plot_part_data_4$value, type = 'box')
# plot_ly(plot_part_data_5, x = plot_part_data_5$part, y = plot_part_data_5$value, type = 'box')
# plot_ly(plot_part_data_6, x = plot_part_data_6$part, y = plot_part_data_6$value, type = 'box')

Greatest inconsistency in classes

Atom

# Ten res_name groups with the largest mean absolute difference between the
# local and dictionary non-hydrogen atom counts, rendered as a table.
data %>%
  select(res_name, local_res_atom_non_h_count, dict_atom_non_h_count) %>%
  mutate(atom_gap = abs(local_res_atom_non_h_count - dict_atom_non_h_count)) %>%
  group_by(res_name) %>%
  summarise(atom_inconsistency = mean(atom_gap)) %>%
  arrange(desc(atom_inconsistency)) %>%
  head(10) %>%
  kable()
res_name atom_inconsistency
PLC 16.7500000
LHG 5.9855072
C8E 2.3714286
NDP 1.6956522
PG4 1.4675325
NAP 1.4067797
MLY 1.2000000
CME 1.0000000
MAN 1.0000000
NAG 0.9889381

Electron

# Ten res_name groups with the largest mean absolute difference between the
# local and dictionary non-hydrogen electron sums, rendered as a table.
data %>%
  select(
    res_name,
    local_res_atom_non_h_electron_sum,
    dict_atom_non_h_electron_sum
  ) %>%
  mutate(
    electron_gap = abs(
      local_res_atom_non_h_electron_sum - dict_atom_non_h_electron_sum
    )
  ) %>%
  group_by(res_name) %>%
  summarise(electron_inconsistency = mean(electron_gap)) %>%
  arrange(desc(electron_inconsistency)) %>%
  head(10) %>%
  kable()
res_name electron_inconsistency
PLC 111.875000
LHG 45.304348
C8E 14.971429
NDP 11.086956
NAP 9.932203
PG4 9.922078
MLY 9.233333
CME 8.000000
MAN 8.000000
NAG 7.911504